In [89]:
!pip install boto3
Requirement already satisfied: boto3 in /usr/local/lib/python3.10/dist-packages (1.28.54)
Requirement already satisfied: botocore<1.32.0,>=1.31.54 in /usr/local/lib/python3.10/dist-packages (from boto3) (1.31.54)
Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from boto3) (1.0.1)
Requirement already satisfied: s3transfer<0.7.0,>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from boto3) (0.6.2)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.10/dist-packages (from botocore<1.32.0,>=1.31.54->boto3) (2.8.2)
Requirement already satisfied: urllib3<1.27,>=1.25.4 in /usr/local/lib/python3.10/dist-packages (from botocore<1.32.0,>=1.31.54->boto3) (1.26.16)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.32.0,>=1.31.54->boto3) (1.16.0)
In [158]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import boto3
import yaml
import plotly.express as px

from matplotlib.ticker import FuncFormatter

import dtale
In [91]:
def number_formatter(number, pos=None):
    """Convert a number into a human readable format."""
    magnitude = 0
    while abs(number) >= 1000:
        magnitude += 1
        number /= 1000.0
    return '%.1f%s' % (number, ['', 'K', 'M', 'B', 'T', 'Q'][magnitude])
In [92]:
with open("/content/sample_data/credentials.yaml", "r") as f:
    config = yaml.safe_load(f)
In [93]:
def cargar_datos_s3(bucket, bucket_path):
    session = boto3.Session(
        aws_access_key_id = config['s3']['aws_access_key_id'],
        aws_secret_access_key = config['s3']['aws_secret_access_key'],
        aws_session_token = config['s3']['aws_session_token']
    )

    s3 = session.resource('s3')

    obj = s3.Object(bucket, bucket_path).get()['Body'].read()
    dataset = pickle.loads(obj)

    return dataset
In [94]:
session = boto3.Session(
    aws_access_key_id = config['s3']['aws_access_key_id'],
    aws_secret_access_key = config['s3']['aws_secret_access_key'],
    aws_session_token = config['s3']['aws_session_token']
)

s3 = session.client('s3')
In [95]:
bucket = "aplicaciones-cd-1-2" + config['iexe']['matricula']
key = "limpieza/"
In [96]:
bucket_path = s3.list_objects_v2(Bucket=bucket, Prefix=key)['Contents'][-1]['Key']
In [97]:
bucket_path
Out[97]:
'limpieza/datos-limpios-2023-09-17.pkl'
In [98]:
dataset = cargar_datos_s3(bucket, bucket_path)
In [99]:
dataset.head()
Out[99]:
inspection_id dba_name aka_name license_ facility_type risk address city state zip inspection_date inspection_type results latitude longitude location violations
0 67757 DUNKIN DONUTS/BASKIN-ROBBINS DUNKIN DONUTS/BASKIN-ROBBINS 1380279 Restaurant Risk 2 (Medium) 100 W RANDOLPH ST CHICAGO IL 60601 2010-01-04 Tag Removal Pass 41.884586 -87.631010 {'latitude': '41.88458626715456', 'longitude':... NaN
1 104236 TEMPO CAFE TEMPO CAFE 80916 Restaurant Risk 1 (High) 6 E CHESTNUT ST CHICAGO IL 60611 2010-01-04 Canvass Fail 41.898431 -87.628009 {'latitude': '41.89843137207629', 'longitude':... 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
2 67732 WOLCOTT'S TROQUET 1992039 Restaurant Risk 1 (High) 1834 W MONTROSE AVE CHICAGO IL 60613 2010-01-04 License Re-Inspection Pass 41.961606 -87.675967 {'latitude': '41.961605669949854', 'longitude'... NaN
4 67733 WOLCOTT'S TROQUET 1992040 Restaurant Risk 1 (High) 1834 W MONTROSE AVE CHICAGO IL 60613 2010-01-04 License Re-Inspection Pass 41.961606 -87.675967 {'latitude': '41.961605669949854', 'longitude'... NaN
5 52234 Cafe 608 Cafe 608 2013328 Restaurant Risk 1 (High) 608 W BARRY AVE CHICAGO IL 60657 2010-01-04 License Re-Inspection Pass 41.938007 -87.644755 {'latitude': '41.938006880423615', 'longitude'... NaN

texto en negrita### Los 5 establecimientos con más inspecciones del dataset

In [192]:
inspections_by_type_stablishment = dataset.groupby(['facility_type'], as_index=False)['inspection_id']\
.count()\
.rename(columns={'inspection_id': 'count'})\
.sort_values(by="count", ascending=False)\
.head(5)
In [101]:
inspections_by_type_stablishment
Out[101]:
facility_type count
393 Restaurant 174177
220 Grocery Store 32513
412 School 15971
114 Children's Services Facility 5306
45 Bakery 3768

Muestra los 10 establecimientos con más inspecciones agrupados por facility_type.¶

Se puede apreciar que el Restaurant en todos sus tipos es el establecimiento con más inspecciones en todo el dataset¶

In [208]:
top_10= dataset.groupby('facility_type')['inspection_type'].value_counts().nlargest(10)
print(top_10)
facility_type  inspection_type        
Restaurant     Canvass                    91116
               Canvass Re-Inspection      19282
               License                    19239
               Complaint                  19187
Grocery Store  Canvass                    13375
School         Canvass                    12279
Restaurant     Complaint Re-Inspection     7714
               License Re-Inspection       6182
               Short Form Complaint        6129
Grocery Store  License                     5608
Name: inspection_type, dtype: int64
In [209]:
dataset.columns
Out[209]:
Index(['inspection_id', 'dba_name', 'aka_name', 'license_', 'facility_type',
       'risk', 'address', 'city', 'state', 'zip', 'inspection_date',
       'inspection_type', 'results', 'latitude', 'longitude', 'location',
       'violations', 'year_inspection'],
      dtype='object')

Se puede observar que las observaciones del dataset tienen una distribución normal, presentando algunos outlayers que no influyen demasiado para el análisis de los datos.¶

In [103]:
sns.set_theme(style='darkgrid', context='paper') #style= dark, darkgrid,white, whitegrid,thicks  #context= paper, notebook, talk, poster
sns.relplot(data=dataset)
plt.show()

Aquí se muestra un scatterplot de las variables risk y results donde se puede observar que las variables tienen una distribución normal entre ellas¶

In [212]:
sns.scatterplot(x='risk', y='results',  data=dataset)
Out[212]:
<Axes: xlabel='risk', ylabel='results'>

Distribución entre las variables facility_type y risk (agrupado por tipo de riesgo), distribuyendose normalmente en todo el dataset¶

In [107]:
sns.scatterplot(x='facility_type', y='risk', data=dataset);

Gráfico de los 5 establecimientos con más inspecciones¶

In [108]:
graf_bar = sns.barplot(x='facility_type', y='count', data=inspections_by_type_stablishment)
sns.set_theme(style="darkgrid", context="paper")

graf_bar.set_title("Los 5 tipos de establecimiento más inspeccionados")
graf_bar.set_xlabel("Tipo de establecimiento")
graf_bar.set_ylabel("Total de inspecciones")
graf_bar.yaxis.set_major_formatter(FuncFormatter(number_formatter))
graf_bar.set_xticklabels(graf_bar.get_xticklabels(), rotation=90)
Out[108]:
[Text(0, 0, 'Restaurant'),
 Text(1, 0, 'Grocery Store'),
 Text(2, 0, 'School'),
 Text(3, 0, "Children's Services Facility"),
 Text(4, 0, 'Bakery')]

Número de Inspecciones por año¶

In [109]:
dataset['year_inspection'] = dataset.inspection_date.dt.year
In [111]:
inspections_by_year = dataset.groupby(['year_inspection'], as_index=False)['inspection_id']\
.count()\
.rename(columns={'inspection_id': 'count'})\
.sort_values(by="year_inspection")
In [112]:
inspections_by_year
Out[112]:
year_inspection count
0 2010 18002
1 2011 18668
2 2012 18796
3 2013 20893
4 2014 21485
5 2015 20852
6 2016 22747
7 2017 21507
8 2018 17117
9 2019 18971
10 2020 15086
11 2021 15809
12 2022 16866
13 2023 12034

Gráfica de número de inspecciones por año¶

In [214]:
graf_bar = sns.barplot(x="year_inspection", y="count", data=inspections_by_year)
graf_bar.set_title("Número de inspecciones por año")
graf_bar.set_ylabel('# inspecciones')
graf_bar.set_xlabel("Año")
Out[214]:
Text(0.5, 0, 'Año')

Inspecciones que menos se realizan¶

In [120]:
menos = dataset.groupby(['inspection_type'], as_index=False)['inspection_id']\
.count()\
.sort_values(by="inspection_id", ascending=True)\
.rename(columns={'inspection_id': 'count'})\
.head(5)

menos
Out[120]:
inspection_type count
0 1315 license reinspection 1
78 Summer Feeding 1
77 Special Task Force 1
73 Sample Collection 1
71 SMOKING COMPLAINT 1
In [121]:
g = sns.barplot(x="inspection_type", y="count",data=menos)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
sns.set_theme(style="white", context="poster")

Porcentaje de establecimientos con menos inspeciones¶

In [218]:
graf_bar = dataset.inspection_type.value_counts(normalize=True).sort_values(ascending=True).head(5)
graf_bar
Out[218]:
ADDENDUM                                    0.000004
Duplicated                                  0.000004
finish complaint inspection from 5-18-10    0.000004
CLOSE-UP/COMPLAINT REINSPECTION             0.000004
LICENSE                                     0.000004
Name: inspection_type, dtype: float64
In [219]:
#b = pd.DataFrame({'inspection_type': a.index,
 #                'prop': a})
#b

b = pd.DataFrame({'inspection_type': graf_bar.index,
                 'prop': graf_bar})
b
Out[219]:
inspection_type prop
ADDENDUM ADDENDUM 0.000004
Duplicated Duplicated 0.000004
finish complaint inspection from 5-18-10 finish complaint inspection from 5-18-10 0.000004
CLOSE-UP/COMPLAINT REINSPECTION CLOSE-UP/COMPLAINT REINSPECTION 0.000004
LICENSE LICENSE 0.000004
In [124]:
g = sns.barplot(x="inspection_type", y="prop", data=b)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_ylim(0,1)
sns.set_theme(style="darkgrid", context="poster")

Porcentaje de establecimientos tienes por cada tipo de resultado¶

In [227]:
graf_bar = dataset.results.value_counts(normalize=True)*100
graf_bar
Out[227]:
Pass                    51.459822
Fail                    19.507945
Pass w/ Conditions      15.347734
Out of Business          8.485780
No Entry                 3.925697
Not Ready                1.242500
Business Not Located     0.030522
Name: results, dtype: float64

Mapa de inspecciones¶

In [151]:
px.set_mapbox_access_token(open("/content/sample_data/mapbox_token").read())
In [152]:
# @title Texto de título predeterminado
fig = px.scatter_mapbox(dataset, lat="latitude", lon="longitude", hover_name="city", color="results",
                        hover_data=["aka_name", "facility_type"], zoom=9, height=400, opacity=0.3)
fig.show()
  • REPORTE DE HALLAZGOS

Se utilizaron las librerías dtale, missingno y sketch para elaborar el reporte de hallazgos¶

In [ ]:
pip install dtale
In [154]:
import dtale.app as dtale_app
In [155]:
dtale_app.USE_COLAB=True
In [159]:
dtale_app.show(dataset, host='localhost')
/usr/local/lib/python3.10/dist-packages/dtale/views.py:784: FutureWarning:

['aka_name', 'license_', 'facility_type', 'risk', 'city', 'state', 'zip', 'inspection_type', 'violations'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.

Out[159]:
https://3owynk95bez-496ff2e9c6d22116-40000-colab.googleusercontent.com/dtale/main/1
In [160]:
dtale_app.show(dataset)
/usr/local/lib/python3.10/dist-packages/dtale/views.py:784: FutureWarning:

['aka_name', 'license_', 'facility_type', 'risk', 'city', 'state', 'zip', 'inspection_type', 'violations'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.

Out[160]:
https://3owynk95bez-496ff2e9c6d22116-40000-colab.googleusercontent.com/dtale/main/2
In [ ]:
pip install missingno
In [168]:
import missingno as msno

Genera una gráfica de barras con la composición de las variables y número de elementos en cada una del dataset¶

In [170]:
msno.bar(dataset)
Out[170]:
<Axes: >

Genera una matrix de distribución de las variable del dataset, donde se puede apreciar los missing values de cada variable¶

In [171]:
msno.matrix(dataset)
Out[171]:
<Axes: >

Genera un dendrograma que muestra las distancias entre cada par de clases de manera secuencial o tipo arbol.¶

Nos muestra qué clases se encuentran próximas entre sí¶

In [172]:
msno.dendrogram(dataset)
Out[172]:
<Axes: >

Nos muestra un mapa de calor con la relación entre variables¶

In [173]:
msno.heatmap(dataset)
Out[173]:
<Axes: >
In [ ]:

In [177]:
pip install sketch
Collecting sketch
  Downloading sketch-0.4.2-py3-none-any.whl (16 kB)
Requirement already satisfied: pandas>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from sketch) (1.5.3)
Collecting datasketch>=1.5.8 (from sketch)
  Downloading datasketch-1.6.3-py3-none-any.whl (88 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 88.4/88.4 kB 2.5 MB/s eta 0:00:00
Collecting datasketches>=4.0.0 (from sketch)
  Downloading datasketches-4.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (827 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 827.0/827.0 kB 6.4 MB/s eta 0:00:00
Requirement already satisfied: ipython in /usr/local/lib/python3.10/dist-packages (from sketch) (7.34.0)
Collecting lambdaprompt>=0.5.4 (from sketch)
  Downloading lambdaprompt-0.5.5-py3-none-any.whl (14 kB)
Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from sketch) (23.1)
Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.10/dist-packages (from datasketch>=1.5.8->sketch) (1.23.5)
Requirement already satisfied: scipy>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from datasketch>=1.5.8->sketch) (1.11.2)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from lambdaprompt>=0.5.4->sketch) (2.31.0)
Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from lambdaprompt>=0.5.4->sketch) (3.8.5)
Collecting python-dotenv (from lambdaprompt>=0.5.4->sketch)
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from lambdaprompt>=0.5.4->sketch) (3.1.2)
Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.10/dist-packages (from lambdaprompt>=0.5.4->sketch) (1.5.7)
Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from lambdaprompt>=0.5.4->sketch) (6.0.1)
Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from lambdaprompt>=0.5.4->sketch) (8.2.3)
Requirement already satisfied: pydantic in /usr/local/lib/python3.10/dist-packages (from lambdaprompt>=0.5.4->sketch) (1.10.12)
Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.3.0->sketch) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.3.0->sketch) (2023.3.post1)
Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (67.7.2)
Collecting jedi>=0.16 (from ipython->sketch)
  Downloading jedi-0.19.0-py2.py3-none-any.whl (1.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 9.4 MB/s eta 0:00:00
Requirement already satisfied: decorator in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (4.4.2)
Requirement already satisfied: pickleshare in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (0.7.5)
Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (5.7.1)
Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (3.0.39)
Requirement already satisfied: pygments in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (2.16.1)
Requirement already satisfied: backcall in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (0.2.0)
Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (0.1.6)
Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.10/dist-packages (from ipython->sketch) (4.8.0)
Requirement already satisfied: parso<0.9.0,>=0.8.3 in /usr/local/lib/python3.10/dist-packages (from jedi>=0.16->ipython->sketch) (0.8.3)
Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.10/dist-packages (from pexpect>4.3->ipython->sketch) (0.7.0)
Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython->sketch) (0.2.6)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas>=1.3.0->sketch) (1.16.0)
Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->lambdaprompt>=0.5.4->sketch) (23.1.0)
Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->lambdaprompt>=0.5.4->sketch) (3.2.0)
Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->lambdaprompt>=0.5.4->sketch) (6.0.4)
Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->lambdaprompt>=0.5.4->sketch) (4.0.3)
Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->lambdaprompt>=0.5.4->sketch) (1.9.2)
Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->lambdaprompt>=0.5.4->sketch) (1.4.0)
Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->lambdaprompt>=0.5.4->sketch) (1.3.1)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->lambdaprompt>=0.5.4->sketch) (2.1.3)
Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic->lambdaprompt>=0.5.4->sketch) (4.5.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->lambdaprompt>=0.5.4->sketch) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->lambdaprompt>=0.5.4->sketch) (1.26.16)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->lambdaprompt>=0.5.4->sketch) (2023.7.22)
Installing collected packages: python-dotenv, jedi, datasketches, datasketch, lambdaprompt, sketch
Successfully installed datasketch-1.6.3 datasketches-4.1.0 jedi-0.19.0 lambdaprompt-0.5.5 python-dotenv-1.0.0 sketch-0.4.2

Esta librería nos permite utilizar la inteligencia artificial para hacer preguntas relacionadas con el an{alisis que queremos hacer en nuestro dataset¶

In [178]:
import sketch
In [179]:
dataset.sketch.ask('what are the max values of each numerical column?')
index: 259721.0 inspection_id: 2581594.0 latitude: 42.02106424782547 longitude: -87.5250941359867 year_inspection: 2023.0
In [181]:
dataset.sketch.ask('create function generate features for this dataset')
def generate_features(df): # Generate summary statistics summary_stats = df.describe() # Generate descriptive data column_names = df.columns column_types = df.dtypes index_type = df.index.dtype row_count = df.shape[0] column_count = df.shape[1] unique_counts = df.nunique() head_sample = list(df.head(5).values) quantiles = list(df.quantile([0, 0.25, 0.5, 0.75, 1]).values) # Return the generated features return {'summary_stats': summary_stats, 'column_names': column_names, 'column_types': column_types, 'index_type': index_type, 'row_count': row_count, 'column_count': column_count, 'unique_counts': unique_counts, 'head_sample': head_sample, 'quantiles': quantiles}
In [183]:
dataset.sketch.ask('create function ColumnTransformer for this dataset')
from sklearn.compose import ColumnTransformer # Create the column transformer column_trans = ColumnTransformer([('index', 'passthrough', ['index']), ('inspection_id', 'passthrough', ['inspection_id']), ('dba_name', 'passthrough', ['dba_name']), ('aka_name', 'passthrough', ['aka_name']), ('license_', 'passthrough', ['license_']), ('facility_type', 'passthrough', ['facility_type']), ('risk', 'passthrough', ['risk']), ('address', 'passthrough', ['address']), ('city', 'passthrough', ['city']), ('state', 'passthrough', ['state']), ('zip', 'passthrough', ['zip']), ('inspection_date', 'passthrough', ['inspection_date']), ('inspection_type', 'passthrough', ['inspection_type']), ('results', 'passthrough', ['results']), ('latitude', 'passthrough', ['latitude']), ('longitude', 'passthrough', ['longitude']), ('location','passthrough' ,['location']), ('violations','passthrough' ,['violations']), ('year_inspection','passthrough' ,['year_inspection'])])
In [184]:
dataset.sketch.ask('create scatterplot for variable risk and results for this dataset')
import matplotlib.pyplot as plt # Create a scatterplot of risk and results plt.scatter(df['risk'], df['results']) # Label the axes plt.xlabel('Risk') plt.ylabel('Results') # Show the plot plt.show()
In [190]:
plt.plot(dataset['year_inspection'], dataset['results'], color='blue')
plt.show()
In [200]:
dataset.sketch.ask('what is the top 10 inspections of variable facility_type in this dataset?')
Top 10 inspections of variable facility_type in this dataset: 1. Restaurant: 253748 2. Grocery Store: 5863 3. School: 4984 4. Daycare (2 - 6 Years): 3347 5. Daycare (Under 2 Years): 3106 6. Bakery: 2890 7. Nursing Home: 2845 8. Hospital: 2420 9. Long Term Care: 1790 10. Retail Food Establishment: 1617
In [201]:
dataset.sketch.ask('crete code for the top 10 inspections of variable facility_type in this dataset')
top_10_inspections = df.groupby('facility_type')['inspection_id'].value_counts().nlargest(10) print(top_10_inspections)
In [202]:

facility_type        inspection_id
(convenience store)  401296           1
(gas station)        278172           1
                     284419           1
1005 NURSING HOME    353301           1
                     353310           1
                     1396140          1
1023                 462834           1
                     462850           1
                     462858           1
                     1277495          1
Name: inspection_id, dtype: int64
In [203]:
dataset.sketch.ask('crete barplot for top_10 in this dataset')
import matplotlib.pyplot as plt # Get the top 10 values from the dataset top_10 = df['column-name'].value_counts()[:10] # Create a bar plot of the top 10 values plt.bar(top_10.index, top_10.values) plt.xlabel('Column Name') plt.ylabel('Count') plt.title('Top 10 Values in Column Name') plt.show()

Aquí se intento realizar el reporte de hallazgos usando chart_studio, nos muestra que el dataset es muy pesado para correr la librería¶

In [ ]:
pip install chart_studio
Requirement already satisfied: chart_studio in /usr/local/lib/python3.10/dist-packages (1.1.0)
Requirement already satisfied: plotly in /usr/local/lib/python3.10/dist-packages (from chart_studio) (5.15.0)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from chart_studio) (2.31.0)
Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.10/dist-packages (from chart_studio) (1.3.4)
Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from chart_studio) (1.16.0)
Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from plotly->chart_studio) (8.2.3)
Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from plotly->chart_studio) (23.1)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->chart_studio) (3.2.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->chart_studio) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->chart_studio) (1.26.16)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->chart_studio) (2023.7.22)
In [ ]:
fig.write_html('hallazgos.html')
In [ ]:
import chart_studio
In [ ]:
username='MEDELLINS'
api_key='K5p9YaEYBy1rQr87V79Y'
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
In [ ]:
import plotly.graph_objs as sc
In [ ]:
from plotly.graph_objs import Scattergl

El archivo es demasiado grande para generar una grafica con la cuenta DEMO de plotly. PlotlyRequestError: This file is too big! Your current subscription is limited to 524.288 KB uploads. For more information, please visit: https://plotly.com/get-pricing/.¶

Sin embargo la imagen se muestra correctamente en el archivo hallazgos.html guardado en el escritorio de mi computadora. Se adjuntan imagenes.¶

In [ ]:
import chart_studio.plotly as py
py.plot(fig, filename = 'reporte_hallazgos', auto_open=True)
---------------------------------------------------------------------------
PlotlyRequestError                        Traceback (most recent call last)
<ipython-input-137-3d7a0fc566d3> in <cell line: 2>()
      1 import chart_studio.plotly as py
----> 2 py.plot(fig, filename = 'reporte_hallazgos', auto_open=True)

/usr/local/lib/python3.10/dist-packages/chart_studio/plotly/plotly.py in plot(figure_or_data, validate, **plot_options)
    274             grid_filename = filename + "_grid"
    275 
--> 276         grid_ops.upload(
    277             grid=grid,
    278             filename=grid_filename,

/usr/local/lib/python3.10/dist-packages/chart_studio/plotly/plotly.py in upload(cls, grid, filename, world_readable, auto_open, meta)
   1085                 payload["parent_path"] = parent_path
   1086 
-> 1087         file_info = _create_or_overwrite_grid(payload)
   1088 
   1089         cols = file_info["cols"]

/usr/local/lib/python3.10/dist-packages/chart_studio/plotly/plotly.py in _create_or_overwrite_grid(data, max_retries)
   1548     # Create file
   1549     try:
-> 1550         res = api_module.create(data)
   1551     except exceptions.PlotlyRequestError as e:
   1552         if max_retries > 0 and "already exists" in e.message:

/usr/local/lib/python3.10/dist-packages/chart_studio/api/v2/grids.py in create(body)
     16     """
     17     url = build_url(RESOURCE)
---> 18     return request("post", url, json=body)
     19 
     20 

/usr/local/lib/python3.10/dist-packages/retrying.py in wrapped_f(*args, **kw)
     54             @six.wraps(f)
     55             def wrapped_f(*args, **kw):
---> 56                 return Retrying(*dargs, **dkw).call(f, *args, **kw)
     57 
     58             return wrapped_f

/usr/local/lib/python3.10/dist-packages/retrying.py in call(self, fn, *args, **kwargs)
    255 
    256             if not self.should_reject(attempt):
--> 257                 return attempt.get(self._wrap_exception)
    258 
    259             if self._after_attempts:

/usr/local/lib/python3.10/dist-packages/retrying.py in get(self, wrap_exception)
    299                 raise RetryError(self)
    300             else:
--> 301                 six.reraise(self.value[0], self.value[1], self.value[2])
    302         else:
    303             return self.value

/usr/local/lib/python3.10/dist-packages/six.py in reraise(tp, value, tb)
    717             if value.__traceback__ is not tb:
    718                 raise value.with_traceback(tb)
--> 719             raise value
    720         finally:
    721             value = None

/usr/local/lib/python3.10/dist-packages/retrying.py in call(self, fn, *args, **kwargs)
    249 
    250             try:
--> 251                 attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
    252             except:
    253                 tb = sys.exc_info()

/usr/local/lib/python3.10/dist-packages/chart_studio/api/v2/utils.py in request(method, url, **kwargs)
    178         content = response.content if response else "No content"
    179         raise exceptions.PlotlyRequestError(message, status_code, content)
--> 180     validate_response(response)
    181     return response

/usr/local/lib/python3.10/dist-packages/chart_studio/api/v2/utils.py in validate_response(response)
     80         message = content if content else "No Content"
     81 
---> 82     raise exceptions.PlotlyRequestError(message, status_code, content)
     83 
     84 

PlotlyRequestError: This file is too big! Your current subscription is limited to 524.288 KB uploads. For more information, please visit: https://plotly.com/get-pricing/.